This Jupyter notebook contains some visualization examples using plotly. The data set is the Lending Club Loan Data.

Only a subset of the features is selected for visualization. For a comprehensive visualization analysis, please refer to the other two Jupyter notebooks.

In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
from utils import discrete_plot, numerical_plot

warnings.simplefilter('ignore')
pd.options.display.max_rows = 100
init_notebook_mode(connected=True)
%matplotlib inline

Load Dataset

In [2]:
# Read the cleaned train / test CSV files.
train = pd.read_csv('./data/train_clean.csv')
test = pd.read_csv('./data/test_clean.csv')

# Class balance of the target column (imbalanced dataset):
# target1 sums the target values, target0 sums their complement (1 - target).
n_rows = len(train)
target1 = train['target'].sum()
target0 = (1 - train['target']).sum()

# Report each class count together with its share of the training rows.
for label, count in (('Target 0:\t', target0), ('Target 1:\t', target1)):
    print(label, count, '\t', np.round(count / n_rows, 4))
print('0/1 Ratio:\t', np.round(target0 / target1, 4))
Target 0:	 549951 	 0.7616
Target 1:	 172191 	 0.2384
0/1 Ratio:	 3.1938
In [3]:
# Bar chart of the target count distribution: one bar (and one trace) per class.
bar_specs = [('status 0', target0, 'Status 0'),
             ('status 1', target1, 'Status 1')]
data = [go.Bar(x=[x_label], y=[count], name=trace_name)
        for x_label, count, trace_name in bar_specs]

margin = go.layout.Margin(l=50, r=50, b=30, t=40, pad=4)
legend = {'orientation': 'v', 'xanchor': 'auto'}

# Fixed-size figure (autosize disabled) with labelled axes.
layout = go.Layout(
    title='Loan Status Count Plot',
    xaxis={'title': 'Loan Status'},
    yaxis={'title': 'Count'},
    autosize=False,
    width=700,
    height=400,
    margin=margin,
    legend=legend,
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

Visualization

The original data set contains $65$ different features and a binary target label. Among the features, $10$ are categorical and $55$ are numerical. However, some of the numerical features are essentially discrete.

Below, categorical and discrete features are visualized in the same style, while continuous features are visualized in a different style.

I. Categorical Features

In [4]:
# Feature `term`, rendered with the discrete_plot helper imported from utils.
feature = 'term'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=450,
    )
)
In [5]:
# Feature `home_ownership`, rendered with the discrete_plot helper imported from utils.
feature = 'home_ownership'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=450,
    )
)
In [6]:
# Feature `verification_status`, rendered with the discrete_plot helper imported from utils.
feature = 'verification_status'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=450,
    )
)
In [7]:
# Feature `purpose`, rendered with the discrete_plot helper (taller figure: height=600).
feature = 'purpose'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=600,
    )
)
In [8]:
# Feature `title`, rendered with the discrete_plot helper (taller figure: height=600).
feature = 'title'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=600,
    )
)
In [9]:
# addr_state: number of loans per state.
# count() tallies the non-null `target` entries within each addr_state group.
state_count = train.groupby('addr_state')['target'].count().reset_index()
state_count = state_count.sort_values(by='target', ascending=False)

# Colour scale as [position in 0..1, colour] pairs, from light to dark purple.
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'],
       [0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],
       [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]

# Single choropleth trace: states are matched via locationmode='USA-states'
# and coloured by their loan count; state outlines are drawn in white.
data = [dict(type='choropleth', colorscale=scl, autocolorscale=False,
        locations=state_count['addr_state'], z=state_count['target'],
        locationmode='USA-states', colorbar=dict(title='Counts'),
        marker=dict(line=dict(color='rgb(255,255,255)', width=2)))]

geo = dict(scope='usa', projection=dict(type='albers usa'),
           showlakes=True, lakecolor='rgb(255, 255, 255)')

# go.layout.Margin is used instead of the deprecated top-level go.Margin,
# matching how the margin is built for the loan status count plot.
layout = dict(title='Loan Count Distribution by State', geo=geo,
              margin=go.layout.Margin(l=50, r=50, b=50, t=40, pad=4),
              width=1000, height=600)

fig = dict(data=data, layout=layout)
iplot(fig)
In [10]:
# addr_state: mean of `target` per state (labelled 'Default Rate' in the plot).
state_rate = train.groupby('addr_state')['target'].mean().reset_index()
state_rate = state_rate.sort_values(by='target', ascending=False)

# Colour scale as [position in 0..1, colour] pairs, from light to dark purple.
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'],
       [0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],
       [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]

# Single choropleth trace: states are matched via locationmode='USA-states'
# and coloured by their mean target value; state outlines are drawn in white.
data = [dict(type='choropleth', colorscale=scl, autocolorscale=False,
        locations=state_rate['addr_state'], z=state_rate['target'],
        locationmode='USA-states', colorbar=dict(title='Default Rate'),
        marker=dict(line=dict(color='rgb(255,255,255)', width=2)))]

geo = dict(scope='usa', projection=dict(type='albers usa'),
           showlakes=True, lakecolor='rgb(255, 255, 255)')

# go.layout.Margin is used instead of the deprecated top-level go.Margin,
# matching how the margin is built for the loan status count plot.
layout = dict(title='Loan Default Rate Distribution by State', geo=geo,
              margin=go.layout.Margin(l=50, r=50, b=50, t=40, pad=4),
              width=1000, height=600)

fig = dict(data=data, layout=layout)
iplot(fig)
In [11]:
# Feature `application_type`, rendered with the discrete_plot helper imported from utils.
feature = 'application_type'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=450,
    )
)
In [12]:
# Feature `grade`, rendered with the discrete_plot helper imported from utils.
feature = 'grade'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=450,
    )
)
In [13]:
# Feature `sub_grade`, rendered with the discrete_plot helper (height=500).
feature = 'sub_grade'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=500,
    )
)

II. Discrete Features

In [14]:
# Feature `emp_length`, rendered with the discrete_plot helper (height=500).
feature = 'emp_length'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=500,
    )
)
In [15]:
# Feature `inq_last_6mths`, rendered with the discrete_plot helper (height=500).
feature = 'inq_last_6mths'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=500,
    )
)
In [16]:
# Feature `acc_open_past_24mths`, rendered with the discrete_plot helper (height=600).
feature = 'acc_open_past_24mths'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=600,
    )
)
In [17]:
# Feature `mths_since_recent_inq`, rendered with the discrete_plot helper (height=500).
feature = 'mths_since_recent_inq'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=500,
    )
)
In [18]:
# Feature `num_actv_bc_tl`, rendered with the discrete_plot helper (height=500).
feature = 'num_actv_bc_tl'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=500,
    )
)
In [19]:
# Feature `num_actv_rev_tl`, rendered with the discrete_plot helper (height=600).
feature = 'num_actv_rev_tl'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=600,
    )
)
In [23]:
# Feature `num_op_rev_tl`, rendered with the discrete_plot helper (height=600).
feature = 'num_op_rev_tl'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=600,
    )
)
In [25]:
# Feature `num_rev_tl_bal_gt_0`, rendered with the discrete_plot helper (height=500).
feature = 'num_rev_tl_bal_gt_0'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=500,
    )
)
In [26]:
# Feature `num_sats`, rendered with the discrete_plot helper (height=600).
feature = 'num_sats'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=600,
    )
)
In [27]:
# Feature `num_tl_op_past_12m`, rendered with the discrete_plot helper (height=500).
feature = 'num_tl_op_past_12m'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=500,
    )
)
In [28]:
# Feature `credit_length`, rendered with the discrete_plot helper (height=600).
feature = 'credit_length'
iplot(
    discrete_plot(
        data=train,
        feature=feature,
        width=1000,
        height=600,
    )
)

III. Continuous Features

In [29]:
# Feature `loan_amnt`, rendered with the numerical_plot helper imported from utils
# (40 histogram bins, 100 scatter bins, log=False).
feature = 'loan_amnt'
iplot(
    numerical_plot(
        train,
        feature,
        hist_bins=40,
        scatter_bins=100,
        log=False,
        w=1000,
        h=450,
    )
)
In [30]:
# Feature `int_rate`, rendered with the numerical_plot helper imported from utils
# (40 histogram bins, 100 scatter bins, log=False).
feature = 'int_rate'
iplot(
    numerical_plot(
        train,
        feature,
        hist_bins=40,
        scatter_bins=100,
        log=False,
        w=1000,
        h=450,
    )
)